if (!require("pacman")) install.packages("pacman")
## Indlæser krævet pakke: pacman
if (!require("aamisc")){
pacman::p_load("qvalue", "rain", "limma", "devtools")
url <- "https://cran.r-project.org/src/contrib/Archive/HarmonicRegression/HarmonicRegression_1.0.tar.gz"
pkgFile <- "HarmonicRegression_1.0.tar.gz"
download.file(url = url, destfile = pkgFile)
install.packages(pkgs=pkgFile, type="source", repos=NULL)
file.remove(pkgFile)
}
## Indlæser krævet pakke: aamisc
pacman::p_load(
"edgeR", "readr", "readxl", "biomaRt", "magrittr", "tibble", "stringr",
"ggplot2", "data.table", "patchwork", "openxlsx", "aamisc", "limma",
"devtools", "dplyr", "RColorBrewer", "ggrepel", "ComplexUpset", "tidyr")
# Color scheme and shapes for figures
RA_colors <- c("#003049", "#0073af")
LA_colors <- c("#d62828", "#e77d7d")
RV_colors <- c("#ff7f00", "#ffb266")
LV_colors <- c("#fcbf49", "#fee2ad")
# Shape 16 (circle) for Control and Shape 17 (triangle) for AF
condition_shapes <- c("Control" = 16, "AF" = 17)
chamber_colors <- c("RA" = "#003049", "LA" = "#d62828", "RV" = "#ff7f00", "LV" = "#fcbf49")
chamber_fill_colors <- c("RA" = "#0073af", "LA" = "#e77d7d", "RV" = "#ffb266", "LV" = "#fee2ad")
# Load count data from a local TSV file and metadata through Excel
count_file <- "../../../data/count/gene-expression-all-reverse-stranded-countReadPairs.tsv"
count <- readr::read_delim(count_file)
## Rows: 38849 Columns: 54
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): Geneid, Chr, Start, End, Strand
## dbl (49): Length, Dorado_LA_post, Dorado_LVFW, Dorado_RA_post, Dorado_RVFW, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
meta_file <- "../../../data/metadata/meta.xlsx"
meta <- readxl::read_excel(meta_file)
# Gene annotation
geneinfo_file <- "../../../data/gene_annotation/horse_gene_annotation.tsv.gz"
geneinfo <- fread(geneinfo_file)
colnames(geneinfo)
## [1] "Gene stable ID"
## [2] "Gene stable ID version"
## [3] "Gene description"
## [4] "Chromosome/scaffold name"
## [5] "Gene start (bp)"
## [6] "Gene end (bp)"
## [7] "Strand"
## [8] "Gene name"
## [9] "NCBI gene (formerly Entrezgene) ID"
## [10] "NCBI gene (formerly Entrezgene) description"
# Rename columns
setnames(geneinfo, new = c("ENSEMBL", "ENSEMBLv", "Description_detailed",
"Chr", "Start", "End", "Strand", "GENENAME", "ENTREZID", "Description"))
# Remove unnecessary columns
geneinfo <- geneinfo[, c("ENSEMBLv", "Description_detailed") := NULL]
# Remove duplicate entries based on ENSEMBL ID
geneinfo <- geneinfo[!duplicated(ENSEMBL), ]
# Convert annotation data to data frame and set rownames to ENSEMBL IDs
annot <- merge(x = count[,c("Geneid", "Length")],
y = geneinfo,
by.x = "Geneid",
by.y = "ENSEMBL",
all.x = TRUE,
all.y = FALSE)
setnames(annot, old = "Geneid", new = "ENSEMBL")
annot <- data.frame(annot)
rownames(annot) <- annot$ENSEMBL
fwrite(x = annot,
file = "../../../data/gene_annotation/horse_gene_annotation_filtered.tsv.gz",
sep = "\t")
# Clean metadata and ensure it matches the count data
meta <- meta %>% remove_rownames %>% column_to_rownames(var="Sample_ID")
head(meta)
## Horse Group Region Training status Condition
## Dorado_LA_post Dorado AF LA Unknown LA_AF
## Dorado_RA_post Dorado AF RA Unknown RA_AF
## San_Diego_LA_post San Diego AF LA Unknown LA_AF
## San_Diego_RA_post San Diego AF RA Unknown RA_AF
## Kevin_Cook_LA_post Kevin Cook AF LA Unknown LA_AF
## Kevin_Cook_RA_post Kevin Cook AF RA Unknown RA_AF
count <- count[, -c(2:6)]
# Set "Geneid" as rownames and remove from the data frame for easy manipulation
count <- count %>% remove_rownames %>% column_to_rownames(var="Geneid")
head(count)
## Dorado_LA_post Dorado_LVFW Dorado_RA_post Dorado_RVFW
## ENSECAG00000013298 109 267 195 246
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 573 844 943 904
## ENSECAG00000011978 2 13 7 10
## ENSECAG00000005166 143 233 206 273
## ENSECAG00000032916 383 442 623 397
## Im_A_Mets_Fan_LA_post Im_A_Mets_Fan_LVFW
## ENSECAG00000013298 204 255
## ENSECAG00000019402 0 0
## ENSECAG00000052423 1094 1080
## ENSECAG00000011978 21 6
## ENSECAG00000005166 249 262
## ENSECAG00000032916 439 530
## Im_a_Mets_Fan_RA_post Im_a_Mets_Fan_RVFW Jytte_LA_post
## ENSECAG00000013298 259 195 241
## ENSECAG00000019402 0 0 0
## ENSECAG00000052423 931 879 1139
## ENSECAG00000011978 12 4 11
## ENSECAG00000005166 323 212 226
## ENSECAG00000032916 794 438 604
## Jytte_LVFW Jytte_RA_post Jytte_RVFW Kevin_Cook_LA_post
## ENSECAG00000013298 309 320 339 296
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 1051 897 1088 1608
## ENSECAG00000011978 11 13 15 15
## ENSECAG00000005166 299 313 302 316
## ENSECAG00000032916 463 548 494 747
## Kevin_Cook_LVFW Kevin_Cook_RA_post Kevin_Cook_RVFW
## ENSECAG00000013298 316 284 323
## ENSECAG00000019402 0 0 0
## ENSECAG00000052423 1341 1089 1060
## ENSECAG00000011978 11 17 5
## ENSECAG00000005166 218 332 246
## ENSECAG00000032916 635 659 642
## MAP11_LA_post MAP11_LV_midwall MAP11_RApost MAP11_RV_endo
## ENSECAG00000013298 215 330 223 230
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 1015 1022 918 646
## ENSECAG00000011978 13 16 9 14
## ENSECAG00000005166 299 244 374 239
## ENSECAG00000032916 449 497 474 343
## MAP3_LA_post MAP3_LV_midwall MAP3_RApost MAP3_RV_endo
## ENSECAG00000013298 239 328 240 266
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 877 1180 876 784
## ENSECAG00000011978 6 9 9 12
## ENSECAG00000005166 300 287 350 258
## ENSECAG00000032916 419 514 469 313
## MAP4_LA_post MAP4_LV_midwall MAP4_RApost MAP4_RV_endo
## ENSECAG00000013298 189 310 283 254
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 1058 1026 925 910
## ENSECAG00000011978 9 17 21 11
## ENSECAG00000005166 323 306 414 225
## ENSECAG00000032916 460 577 569 395
## MAP6_LA_post MAP6_LV_midwall MAP6_RApost MAP6_RV_endo
## ENSECAG00000013298 316 319 238 279
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 1038 765 765 617
## ENSECAG00000011978 11 10 12 13
## ENSECAG00000005166 365 251 315 255
## ENSECAG00000032916 482 379 380 275
## MAP8_LA_post MAP8_RV_endo MAP8_RApost MAP8_LV_midwall
## ENSECAG00000013298 252 267 191 246
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 1486 717 1025 902
## ENSECAG00000011978 15 14 12 15
## ENSECAG00000005166 456 249 355 275
## ENSECAG00000032916 643 356 478 453
## MAP9_LA_post MAP9_LV_midwall MAP9_RApost MAP9_RV_endo
## ENSECAG00000013298 305 309 233 248
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 1517 993 876 821
## ENSECAG00000011978 34 28 22 21
## ENSECAG00000005166 449 306 334 252
## ENSECAG00000032916 722 556 491 423
## San_Diego_LA_post San_Diego_LVFW San_Diego_RA_post
## ENSECAG00000013298 281 295 319
## ENSECAG00000019402 0 0 0
## ENSECAG00000052423 931 1108 1098
## ENSECAG00000011978 5 12 12
## ENSECAG00000005166 245 288 256
## ENSECAG00000032916 527 551 669
## San_Diego_RVFW Styled_LA_post Styled_LVFW Styled_RA_post
## ENSECAG00000013298 298 179 200 260
## ENSECAG00000019402 0 0 0 0
## ENSECAG00000052423 1002 1089 1151 1195
## ENSECAG00000011978 6 8 10 5
## ENSECAG00000005166 280 222 185 312
## ENSECAG00000032916 409 673 497 641
## Styled_RVFW
## ENSECAG00000013298 335
## ENSECAG00000019402 0
## ENSECAG00000052423 1058
## ENSECAG00000011978 18
## ENSECAG00000005166 314
## ENSECAG00000032916 463
# Remove version numbers from gene names if present (e.g., "Gene.1" -> "Gene").
rownames(count) <- stringr::str_split_fixed(string = rownames(count),
pattern = '[.]',
n = 2)[,1]
# Reorder metadata and annotation tables to match the column order of the count matrix.
column_order <- names(count)
meta_reordered <- meta[column_order, , drop = FALSE]
annot_order <- rownames(count)
annot_reordered <- annot[annot_order, ]
# Create a DGEList object for `edgeR` analysis.
# The DGEList object contains the count matrix, gene annotation, and sample metadata.
d <- DGEList(counts = count, genes = annot_reordered, samples = meta_reordered)
# Define the design matrix based on the experimental conditions
# This design matrix will help retain genes that are expressed in at least one condition
design <- model.matrix(~0 + Condition , d$samples)
colnames(design) <- gsub("Condition", "", colnames(design))
design
## LA_AF LA_Control LV_AF LV_Control RA_AF RA_Control RV_AF
## Dorado_LA_post 1 0 0 0 0 0 0
## Dorado_LVFW 0 0 1 0 0 0 0
## Dorado_RA_post 0 0 0 0 1 0 0
## Dorado_RVFW 0 0 0 0 0 0 1
## Im_A_Mets_Fan_LA_post 1 0 0 0 0 0 0
## Im_A_Mets_Fan_LVFW 0 0 1 0 0 0 0
## Im_a_Mets_Fan_RA_post 0 0 0 0 1 0 0
## Im_a_Mets_Fan_RVFW 0 0 0 0 0 0 1
## Jytte_LA_post 1 0 0 0 0 0 0
## Jytte_LVFW 0 0 1 0 0 0 0
## Jytte_RA_post 0 0 0 0 1 0 0
## Jytte_RVFW 0 0 0 0 0 0 1
## Kevin_Cook_LA_post 1 0 0 0 0 0 0
## Kevin_Cook_LVFW 0 0 1 0 0 0 0
## Kevin_Cook_RA_post 0 0 0 0 1 0 0
## Kevin_Cook_RVFW 0 0 0 0 0 0 1
## MAP11_LA_post 0 1 0 0 0 0 0
## MAP11_LV_midwall 0 0 0 1 0 0 0
## MAP11_RApost 0 0 0 0 0 1 0
## MAP11_RV_endo 0 0 0 0 0 0 0
## MAP3_LA_post 0 1 0 0 0 0 0
## MAP3_LV_midwall 0 0 0 1 0 0 0
## MAP3_RApost 0 0 0 0 0 1 0
## MAP3_RV_endo 0 0 0 0 0 0 0
## MAP4_LA_post 0 1 0 0 0 0 0
## MAP4_LV_midwall 0 0 0 1 0 0 0
## MAP4_RApost 0 0 0 0 0 1 0
## MAP4_RV_endo 0 0 0 0 0 0 0
## MAP6_LA_post 0 1 0 0 0 0 0
## MAP6_LV_midwall 0 0 0 1 0 0 0
## MAP6_RApost 0 0 0 0 0 1 0
## MAP6_RV_endo 0 0 0 0 0 0 0
## MAP8_LA_post 0 1 0 0 0 0 0
## MAP8_RV_endo 0 0 0 0 0 0 0
## MAP8_RApost 0 0 0 0 0 1 0
## MAP8_LV_midwall 0 0 0 1 0 0 0
## MAP9_LA_post 0 1 0 0 0 0 0
## MAP9_LV_midwall 0 0 0 1 0 0 0
## MAP9_RApost 0 0 0 0 0 1 0
## MAP9_RV_endo 0 0 0 0 0 0 0
## San_Diego_LA_post 1 0 0 0 0 0 0
## San_Diego_LVFW 0 0 1 0 0 0 0
## San_Diego_RA_post 0 0 0 0 1 0 0
## San_Diego_RVFW 0 0 0 0 0 0 1
## Styled_LA_post 1 0 0 0 0 0 0
## Styled_LVFW 0 0 1 0 0 0 0
## Styled_RA_post 0 0 0 0 1 0 0
## Styled_RVFW 0 0 0 0 0 0 1
## RV_Control
## Dorado_LA_post 0
## Dorado_LVFW 0
## Dorado_RA_post 0
## Dorado_RVFW 0
## Im_A_Mets_Fan_LA_post 0
## Im_A_Mets_Fan_LVFW 0
## Im_a_Mets_Fan_RA_post 0
## Im_a_Mets_Fan_RVFW 0
## Jytte_LA_post 0
## Jytte_LVFW 0
## Jytte_RA_post 0
## Jytte_RVFW 0
## Kevin_Cook_LA_post 0
## Kevin_Cook_LVFW 0
## Kevin_Cook_RA_post 0
## Kevin_Cook_RVFW 0
## MAP11_LA_post 0
## MAP11_LV_midwall 0
## MAP11_RApost 0
## MAP11_RV_endo 1
## MAP3_LA_post 0
## MAP3_LV_midwall 0
## MAP3_RApost 0
## MAP3_RV_endo 1
## MAP4_LA_post 0
## MAP4_LV_midwall 0
## MAP4_RApost 0
## MAP4_RV_endo 1
## MAP6_LA_post 0
## MAP6_LV_midwall 0
## MAP6_RApost 0
## MAP6_RV_endo 1
## MAP8_LA_post 0
## MAP8_RV_endo 1
## MAP8_RApost 0
## MAP8_LV_midwall 0
## MAP9_LA_post 0
## MAP9_LV_midwall 0
## MAP9_RApost 0
## MAP9_RV_endo 1
## San_Diego_LA_post 0
## San_Diego_LVFW 0
## San_Diego_RA_post 0
## San_Diego_RVFW 0
## Styled_LA_post 0
## Styled_LVFW 0
## Styled_RA_post 0
## Styled_RVFW 0
## attr(,"assign")
## [1] 1 1 1 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Condition
## [1] "contr.treatment"
keep <- filterByExpr(d, design = design)
table(keep)
## keep
## FALSE TRUE
## 24339 14510
# Create a new DGEList object with only the retained genes
y <- d[keep, ,keep.lib.sizes=FALSE]
# Summary of the filtered DGEList object
cat("Number of genes & samples:", dim(y)[1], "&", dim(y)[2], "\n")
## Number of genes & samples: 14510 & 48
# calculate normalization factors (TMM normalization of library size using edgeR)
y <- calcNormFactors(y)
# create design matrix
design <- model.matrix(~0 + Condition , y$samples)
colnames(design) <- gsub("Condition", "", colnames(design))
design
## LA_AF LA_Control LV_AF LV_Control RA_AF RA_Control RV_AF
## Dorado_LA_post 1 0 0 0 0 0 0
## Dorado_LVFW 0 0 1 0 0 0 0
## Dorado_RA_post 0 0 0 0 1 0 0
## Dorado_RVFW 0 0 0 0 0 0 1
## Im_A_Mets_Fan_LA_post 1 0 0 0 0 0 0
## Im_A_Mets_Fan_LVFW 0 0 1 0 0 0 0
## Im_a_Mets_Fan_RA_post 0 0 0 0 1 0 0
## Im_a_Mets_Fan_RVFW 0 0 0 0 0 0 1
## Jytte_LA_post 1 0 0 0 0 0 0
## Jytte_LVFW 0 0 1 0 0 0 0
## Jytte_RA_post 0 0 0 0 1 0 0
## Jytte_RVFW 0 0 0 0 0 0 1
## Kevin_Cook_LA_post 1 0 0 0 0 0 0
## Kevin_Cook_LVFW 0 0 1 0 0 0 0
## Kevin_Cook_RA_post 0 0 0 0 1 0 0
## Kevin_Cook_RVFW 0 0 0 0 0 0 1
## MAP11_LA_post 0 1 0 0 0 0 0
## MAP11_LV_midwall 0 0 0 1 0 0 0
## MAP11_RApost 0 0 0 0 0 1 0
## MAP11_RV_endo 0 0 0 0 0 0 0
## MAP3_LA_post 0 1 0 0 0 0 0
## MAP3_LV_midwall 0 0 0 1 0 0 0
## MAP3_RApost 0 0 0 0 0 1 0
## MAP3_RV_endo 0 0 0 0 0 0 0
## MAP4_LA_post 0 1 0 0 0 0 0
## MAP4_LV_midwall 0 0 0 1 0 0 0
## MAP4_RApost 0 0 0 0 0 1 0
## MAP4_RV_endo 0 0 0 0 0 0 0
## MAP6_LA_post 0 1 0 0 0 0 0
## MAP6_LV_midwall 0 0 0 1 0 0 0
## MAP6_RApost 0 0 0 0 0 1 0
## MAP6_RV_endo 0 0 0 0 0 0 0
## MAP8_LA_post 0 1 0 0 0 0 0
## MAP8_RV_endo 0 0 0 0 0 0 0
## MAP8_RApost 0 0 0 0 0 1 0
## MAP8_LV_midwall 0 0 0 1 0 0 0
## MAP9_LA_post 0 1 0 0 0 0 0
## MAP9_LV_midwall 0 0 0 1 0 0 0
## MAP9_RApost 0 0 0 0 0 1 0
## MAP9_RV_endo 0 0 0 0 0 0 0
## San_Diego_LA_post 1 0 0 0 0 0 0
## San_Diego_LVFW 0 0 1 0 0 0 0
## San_Diego_RA_post 0 0 0 0 1 0 0
## San_Diego_RVFW 0 0 0 0 0 0 1
## Styled_LA_post 1 0 0 0 0 0 0
## Styled_LVFW 0 0 1 0 0 0 0
## Styled_RA_post 0 0 0 0 1 0 0
## Styled_RVFW 0 0 0 0 0 0 1
## RV_Control
## Dorado_LA_post 0
## Dorado_LVFW 0
## Dorado_RA_post 0
## Dorado_RVFW 0
## Im_A_Mets_Fan_LA_post 0
## Im_A_Mets_Fan_LVFW 0
## Im_a_Mets_Fan_RA_post 0
## Im_a_Mets_Fan_RVFW 0
## Jytte_LA_post 0
## Jytte_LVFW 0
## Jytte_RA_post 0
## Jytte_RVFW 0
## Kevin_Cook_LA_post 0
## Kevin_Cook_LVFW 0
## Kevin_Cook_RA_post 0
## Kevin_Cook_RVFW 0
## MAP11_LA_post 0
## MAP11_LV_midwall 0
## MAP11_RApost 0
## MAP11_RV_endo 1
## MAP3_LA_post 0
## MAP3_LV_midwall 0
## MAP3_RApost 0
## MAP3_RV_endo 1
## MAP4_LA_post 0
## MAP4_LV_midwall 0
## MAP4_RApost 0
## MAP4_RV_endo 1
## MAP6_LA_post 0
## MAP6_LV_midwall 0
## MAP6_RApost 0
## MAP6_RV_endo 1
## MAP8_LA_post 0
## MAP8_RV_endo 1
## MAP8_RApost 0
## MAP8_LV_midwall 0
## MAP9_LA_post 0
## MAP9_LV_midwall 0
## MAP9_RApost 0
## MAP9_RV_endo 1
## San_Diego_LA_post 0
## San_Diego_LVFW 0
## San_Diego_RA_post 0
## San_Diego_RVFW 0
## Styled_LA_post 0
## Styled_LVFW 0
## Styled_RA_post 0
## Styled_RVFW 0
## attr(,"assign")
## [1] 1 1 1 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Condition
## [1] "contr.treatment"
# estimate dispersions
y <- estimateDisp(y, design)
# normalized expression levels (log2-transformed version of TMM-normalized CPM-values)
CPM <- cpm(y)
logCPM <- cpm(y, log = TRUE)
# Save the logCPM matrix to a CSV file
# write.csv(logCPM, "../output/Log2_TMM_normalized_CPM_values.csv", row.names = TRUE)
# Transform the logCPM matrix into long format for ggplot2 visualization.
# This will allow each gene/sample pair to be plotted individually.
logCPM_melted <- data.table::melt(logCPM)
## Warning in melt.default(logCPM): The melt generic in data.table has been passed
## a matrix and will attempt to redirect to the relevant reshape2 method; please
## note that reshape2 is superseded and is no longer actively developed, and this
## redirection is now deprecated. To continue using melt methods from reshape2
## while both libraries are attached, e.g. melt.list, you can prepend the
## namespace, i.e. reshape2::melt(logCPM). In the next version, this warning will
## become an error.
# Rename columns for better readability and consistency.
# "Var1" and "Var2" refer to the dimensions of the matrix and are renamed to "Gene" and "Sample".
setnames(x = logCPM_melted,
old = c("Var1", "Var2", "value"),
new = c("Gene", "Sample", "logCPM"))
# Add experimental conditions like "Group" or "Region" to each sample for coloring in the plots.
logCPM_melted <- data.table::merge.data.table(x = logCPM_melted,
y = data.table(Sample = rownames(meta), meta),
by = "Sample")
# Color the boxes based on the experimental condition to highlight potential differences.
ggplot(logCPM_melted, aes(x = Sample, y = logCPM)) +
geom_boxplot(aes(color = Condition)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
ggtitle("logCPM boxplots")
# Create density plots of logCPM values for each sample.
# This visualization helps to assess the distribution of expression values and identify batch effects.
ggplot(logCPM_melted, aes(x = logCPM)) +
geom_density(aes(group = Sample, color = Condition)) +
theme_bw() +
ggtitle("logCPM density distributions")
# Define custom colors and shapes for PCA plots
condition_shapes <- c("Control" = 16, "AF" = 17) # Circle for Control, Triangle for AF
# Perform PCA on the logCPM values
pca <- prcomp(t(logCPM)) # Transpose logCPM so that samples are in rows and genes are in columns
# Extract PCA scores
pca_scores <- as.data.frame(pca$x) # PCA scores
pca_scores$Sample <- rownames(pca_scores) # Add sample names
# Merge with metadata for plotting
pca_scores <- merge(pca_scores, meta, by.x = "Sample", by.y = "row.names")
# Define dimensions for plotting
dims <- list(p1 = c("PC1", "PC2"), p2 = c("PC1", "PC3"), p3 = c("PC1", "PC4"))
pca_plot <- list()
# Without labels
for (i in seq_along(dims)){
pca_plot[[i]] <- ggplot(pca_scores, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group")) +
geom_point(size = 4, alpha = 0.8) +
scale_color_manual(values = chamber_colors) +
scale_shape_manual(values = condition_shapes) +
theme_minimal() +
labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
theme(legend.position = "right")
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Combine plots using patchwork
patchwork::wrap_plots(pca_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')
# With labels
for (i in seq_along(dims)){
pca_plot[[i]] <- ggplot(pca_scores, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group", label = "Horse")) +
geom_point(size = 4, alpha = 0.8) +
geom_text(vjust = -0.5, size = 3) + # Add labels for each sample
scale_color_manual(values = chamber_colors) +
scale_shape_manual(values = condition_shapes) +
theme_minimal() +
labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
theme(legend.position = "right")
}
# Combine labeled plots using patchwork
patchwork::wrap_plots(pca_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')
### Extract top loadings
# Define a function to extract the top contributing genes for each component
get_top_loadings <- function(pca_obj, pc_num, top_n = 10) {
# Extract loadings for the specified component
loadings <- pca_obj$rotation[, pc_num]
# Get the top genes with highest absolute loadings
top_genes <- sort(abs(loadings), decreasing = TRUE)[1:top_n]
# Extract gene names and actual loading values for the top genes
top_loadings <- data.frame(
Gene = names(top_genes), # Gene names
Loading = loadings[names(top_genes)], # Loading values
PC = paste0("PC", pc_num) # Principal Component identifier
)
return(top_loadings)
}
### Verdict: Batch effect relating to each individual horse contributing four samples
# Remove batch effect using `removeBatchEffect` before PCA
logCPM_batchRemoved <- removeBatchEffect(logCPM, batch = y$samples$Horse, design = design)
## Coefficients not estimable: batch11
## Warning: Partial NA coefficients for 14510 probe(s)
# Perform PCA on batch-corrected logCPM values
pca_batchRemoved <- prcomp(t(logCPM_batchRemoved))
# Extract PCA scores and add sample information
pca_scores <- as.data.frame(pca_batchRemoved$x)
pca_scores$Sample <- rownames(pca_scores)
# Merge PCA scores with metadata
pca_scores <- merge(pca_scores, meta, by.x = "Sample", by.y = "row.names")
# Define dimensions for PCA plotting
dims <- list(p1 = c("PC1", "PC2"), p2 = c("PC2", "PC3"), p3 = c("PC1", "PC4"))
pca_plot <- list()
# Create PCA plots without labels
for (i in seq_along(dims)){
pca_plot[[i]] <- ggplot(pca_scores, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group")) +
geom_point(size = 4, alpha = 0.8) +
scale_color_manual(values = chamber_colors) +
scale_shape_manual(values = condition_shapes) +
theme_minimal() +
labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
theme(legend.position = "right")
}
# Combine PCA plots using patchwork
patchwork::wrap_plots(pca_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')
# Create PCA plots with labels for each sample
for (i in seq_along(dims)){
pca_plot[[i]] <- ggplot(pca_scores, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group", label = "Horse")) +
geom_point(size = 4, alpha = 0.8) +
geom_text(vjust = -0.5, size = 3) + # Add labels for each sample
scale_color_manual(values = chamber_colors) +
scale_shape_manual(values = condition_shapes) +
theme_minimal() +
labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
theme(legend.position = "right")
}
# Combine labeled PCA plots using patchwork
patchwork::wrap_plots(pca_plot, ncol = 2) + patchwork::plot_layout(guides = 'collect')
# Find top loadings
## Function to get all loadings for specified principal components and save to one file
extract_all_loadings <- function(pca_obj, pcs = 1:6, annot) {
# Extract all loadings for the specified principal components
all_loadings <- pca_obj$rotation[, pcs, drop = FALSE]
# Convert to a data frame for easy manipulation
loadings_df <- as.data.frame(all_loadings)
# Add ENSEMBL IDs as a column
loadings_df$ENSEMBL <- rownames(loadings_df)
# Merge with annotation to include gene names
loadings_df <- merge(loadings_df, annot[, c("ENSEMBL", "GENENAME")], by = "ENSEMBL", all.x = TRUE)
# Rearrange columns to have GENENAME and ENSEMBL first for clarity
loadings_df <- loadings_df[, c("GENENAME", "ENSEMBL", colnames(loadings_df)[1:(ncol(loadings_df) - 2)])]
return(loadings_df)
}
# Extract loadings for PC1 to PC6
all_loadings_df <- extract_all_loadings(pca_obj = pca_batchRemoved, pcs = 1:6, annot = annot_reordered)
# Save the combined loadings to a single CSV file
write.csv(all_loadings_df, file = "../output/All_Loadings_PC1_to_PC6.csv", row.names = FALSE)
# Preview the combined loadings
head(all_loadings_df)
## GENENAME ENSEMBL ENSEMBL.1 PC1 PC2
## 1 MTPAP ENSECAG00000000001 ENSECAG00000000001 0.0021249993 -3.761005e-04
## 2 R3HDM4 ENSECAG00000000003 ENSECAG00000000003 0.0011267619 -4.816464e-03
## 3 RNF170 ENSECAG00000000004 ENSECAG00000000004 0.0007005666 4.874067e-03
## 4 TMEM70 ENSECAG00000000007 ENSECAG00000000007 0.0077844405 -2.652240e-06
## 5 RPLP2 ENSECAG00000000008 ENSECAG00000000008 -0.0023290581 -5.216934e-03
## 6 LRRC58 ENSECAG00000000010 ENSECAG00000000010 -0.0052616115 3.010118e-03
## PC3 PC4 PC5
## 1 -5.083679e-03 -0.0003394471 -0.002158191
## 2 5.224774e-04 0.0014866072 -0.001906639
## 3 -2.923870e-06 0.0002625384 -0.000943564
## 4 4.748882e-03 -0.0030212221 -0.004118943
## 5 1.725059e-02 -0.0110220681 -0.002596778
## 6 -1.455616e-03 0.0045966671 0.007107533
# Source the helper functions
source("pca_helpers_all_samples.R")
# Function to extract loadings based on highlighted genes
extract_and_map_loadings <- function(pca_obj, gene_list, annot) {
ensembl_ids <- annot$ENSEMBL[match(gene_list, annot$GENENAME)]
ensembl_ids <- ensembl_ids[!is.na(ensembl_ids)]
valid_ids <- intersect(ensembl_ids, rownames(pca_obj$rotation))
if (length(valid_ids) == 0) {
warning("No matching ENSEMBL IDs found in PCA loadings.")
return(NULL)
}
loadings_df <- as.data.frame(pca_obj$rotation[valid_ids, , drop = FALSE])
loadings_df$ENSEMBL <- rownames(loadings_df)
loadings_df <- merge(loadings_df, annot[, c("ENSEMBL", "GENENAME")], by = "ENSEMBL", all.x = TRUE)
return(loadings_df)
}
### FOR PC1 & PC2
# Highlighted genes for PCA loadings extraction
highlighted_genes <- c("SCD5", "PITX2", "KCNK3", "BMP10", "KCVN2", "IRX2", "NR4A3", "MYH7")
specific_loadings_df <- extract_and_map_loadings(pca_batchRemoved, highlighted_genes, annot_reordered)
pca_batchRemoved <- prcomp(t(logCPM_batchRemoved))
pca_scores <- as.data.frame(pca_batchRemoved$x) # PCA scores
pca_scores$Sample <- rownames(pca_scores) # Add sample names
pca_scores <- merge(pca_scores, meta, by.x = "Sample", by.y = "row.names")
# Generate PCA plots without and with labels
dimensions <- list(c("PC1", "PC2"))
# Generate and display PCA plots for all samples (both healthy and disease groups)
generate_and_save_pca_plots(
pca_df = pca_scores,
loadings_df = specific_loadings_df,
dimensions = dimensions,
colors = chamber_colors,
shapes = condition_shapes, # Include shapes for control/disease
add_ellipse = FALSE, # Optionally toggle to TRUE if ellipses are needed
shape_var = "Group" # Use Group as the shape variable
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
### FOR PC2 & PC3, PC1 & PC4
# Highlighted genes for PCA loadings extraction
highlighted_genes <- c("NPPA", "KCNJ5", "KCNK3", "NPPB", "MYL7", "KCNV2", "PITX2", "BMP10",
"IRX4", "IRX5", "IRX3", "KCNJ2", "SCD5")
specific_loadings_df <- extract_and_map_loadings(pca_batchRemoved, highlighted_genes, annot_reordered)
print(specific_loadings_df)
## ENSEMBL PC1 PC2 PC3 PC4
## 1 ENSECAG00000007670 -0.05227401 -0.039228131 0.121752401 0.197294927
## 2 ENSECAG00000007922 0.05444175 0.006931374 -0.018064339 -0.016126856
## 3 ENSECAG00000008407 -0.06542148 0.016219977 -0.050015231 -0.065636026
## 4 ENSECAG00000014282 -0.07571580 0.008074537 0.085644536 -0.040248734
## 5 ENSECAG00000014698 -0.01426309 0.078253440 -0.021441142 0.035740370
## 6 ENSECAG00000014892 -0.13160409 -0.010678639 0.031471107 -0.030311520
## 7 ENSECAG00000016602 -0.07321082 0.006809435 -0.047207178 -0.100039607
## 8 ENSECAG00000017403 0.07276413 0.008245171 0.006803378 -0.003054911
## 9 ENSECAG00000018912 -0.07433800 -0.004520907 -0.009058442 -0.004476423
## 10 ENSECAG00000023496 0.06668556 0.011300322 0.031639029 0.035227351
## 11 ENSECAG00000023545 0.07361160 0.009361508 0.004299142 0.003657962
## 12 ENSECAG00000025007 -0.10905821 -0.009476101 -0.012414520 -0.002759771
## 13 ENSECAG00000038775 -0.07815285 -0.003381890 -0.024213114 -0.022184192
## PC5 PC6 PC7 PC8 PC9
## 1 -0.034517979 -0.105035693 0.0335415778 2.339717e-03 -0.050400558
## 2 -0.024091972 0.002825550 0.0167563366 -1.699736e-02 0.009059961
## 3 -0.009609884 0.008048334 -0.0196856531 -2.058912e-02 0.027077607
## 4 0.046901363 0.119284403 0.0245399524 -3.206733e-03 0.037957597
## 5 -0.013571135 -0.017373316 -0.0067161956 2.853471e-02 0.002363195
## 6 0.052735794 0.034702499 0.0405253381 -2.819983e-02 0.017444449
## 7 0.021847510 0.037233077 -0.0003363777 -3.863412e-02 -0.035901694
## 8 -0.003377872 0.024508447 0.0034894687 2.130012e-03 0.012685469
## 9 -0.022211013 0.016516799 0.0115826854 1.070533e-02 -0.006952828
## 10 -0.006385543 -0.009435043 0.0281000028 -2.057182e-03 0.020087719
## 11 0.005779000 0.011568931 0.0089681269 -2.026065e-04 0.020113590
## 12 0.012384525 -0.031183442 0.0036119556 1.845239e-05 0.019176946
## 13 -0.014226770 -0.038084658 0.0230052610 1.230566e-02 -0.017696928
## PC10 PC11 PC12 PC13 PC14
## 1 -0.0406682010 -0.0478721050 -0.0210383924 -0.0062902000 -0.0109909121
## 2 -0.0009258402 0.0032837035 -0.0034883619 0.0030770604 -0.0083290567
## 3 0.0222326443 0.0456264398 -0.0559828397 0.0208894431 0.0113645351
## 4 -0.0036575229 -0.0263514299 0.0331283604 -0.0114431434 0.0241195224
## 5 -0.0344942055 -0.0043542906 0.0002893075 -0.0097573266 -0.0069985513
## 6 0.0035872669 -0.0407175466 -0.0159527793 -0.0169374856 -0.0055278893
## 7 -0.0041344181 0.0075445749 0.0412638917 -0.0004439656 -0.0133117530
## 8 0.0045333676 0.0031444501 0.0009855603 0.0034574046 -0.0062771485
## 9 -0.0210436549 0.0005091515 -0.0100681846 -0.0131162841 -0.0306323267
## 10 -0.0051524854 -0.0229096684 0.0228149929 0.0002942752 0.0122788597
## 11 0.0139444146 0.0015228059 0.0190142937 -0.0003578966 -0.0004129403
## 12 -0.0213580555 -0.0270250041 -0.0492959181 0.0004178917 0.0192838555
## 13 -0.0076833884 -0.0057037608 0.0110135729 -0.0049312236 -0.0089598505
## PC15 PC16 PC17 PC18 PC19
## 1 0.0844996231 -0.022799122 -0.052544404 1.526425e-02 -0.0481451338
## 2 -0.0232162905 0.006346367 -0.024314059 1.606317e-02 -0.0137897135
## 3 0.0033673776 0.001489856 0.055077502 1.176700e-02 0.0116164146
## 4 0.0765147147 -0.049093865 -0.092422580 1.638500e-02 -0.0870742840
## 5 -0.0296014639 0.017422842 -0.017792195 -3.174248e-02 -0.0047453487
## 6 0.0386653985 -0.007406411 -0.055815696 -4.169079e-02 -0.0156626034
## 7 0.0359420540 -0.025129301 0.044267877 5.754526e-02 -0.0437472351
## 8 -0.0173147721 0.008564201 -0.011969621 1.376638e-02 -0.0197647394
## 9 0.0104503209 0.045897760 0.046648701 7.647692e-03 -0.0206674502
## 10 0.0009862748 -0.010919098 -0.013678301 7.264628e-05 -0.0215725804
## 11 -0.0045916253 0.005491875 -0.007974125 1.403380e-03 -0.0076793106
## 12 0.0184696724 -0.002795138 0.012268096 -1.332122e-02 -0.0003808918
## 13 0.0113647277 -0.003376706 0.008903726 -1.095808e-03 0.0010683520
## PC20 PC21 PC22 PC23 PC24
## 1 0.008567295 0.0302295414 -0.007939727 -0.014787966 -7.554412e-02
## 2 0.023617679 0.0132745607 0.029442843 -0.008286473 4.215635e-03
## 3 0.029170852 0.0306411189 -0.012015233 0.015105768 1.153044e-02
## 4 0.021252825 0.0366026327 0.029951521 0.046386657 -5.406510e-03
## 5 0.017830116 -0.0155500570 -0.013883506 -0.010305925 -1.236698e-03
## 6 0.016287730 -0.0055569385 -0.012478637 0.035907356 -3.230011e-02
## 7 0.075896695 -0.0078088451 -0.055552710 -0.006321799 -3.794545e-02
## 8 -0.006002097 -0.0064265657 -0.001602502 -0.002206307 4.516589e-05
## 9 0.020273897 -0.0007999865 -0.027350136 0.021036989 -6.709532e-03
## 10 0.007441009 -0.0003734871 0.004379471 0.003845154 1.153018e-02
## 11 -0.013762579 -0.0111086607 0.016008915 0.009729761 1.167720e-02
## 12 0.009580203 0.0297791220 -0.007373942 -0.016707403 -5.573077e-03
## 13 -0.016331578 0.0079938043 -0.005010974 -0.015417255 -5.654312e-03
## PC25 PC26 PC27 PC28 PC29
## 1 -0.0205663975 0.014295363 -0.0087117156 -0.005819478 0.0180242568
## 2 0.0237773847 0.005400292 0.0093635200 0.002372151 -0.0019047291
## 3 -0.0145585464 0.009824951 -0.0035725748 -0.002156993 0.0279208497
## 4 -0.0201255986 0.053814428 0.0288566341 0.006196124 0.0002587613
## 5 -0.0126381608 -0.016300043 0.0180666819 0.003694298 -0.0163138720
## 6 0.0110771342 0.028460193 0.0260018302 -0.012432077 -0.0014149188
## 7 -0.0079037742 0.015396341 0.0007386441 -0.006959592 0.0087293711
## 8 0.0060093280 0.009904557 0.0085054539 -0.002073155 -0.0112026564
## 9 -0.0233336255 -0.009082222 0.0274029416 0.006530430 -0.0127174215
## 10 -0.0178076440 0.006702277 0.0091208121 -0.004271487 -0.0100109031
## 11 0.0005913259 0.002650115 0.0135145302 0.000128555 -0.0051144449
## 12 -0.0036590888 -0.037978342 -0.0144735318 -0.011264403 -0.0138719753
## 13 0.0044095005 -0.016957107 -0.0108380786 -0.002016082 -0.0051381850
## PC30 PC31 PC32 PC33 PC34
## 1 2.647413e-02 0.0395826679 0.0078979940 -0.018056867 -0.0193066792
## 2 -5.068716e-03 0.0141736564 -0.0061971533 0.003554042 -0.0155269909
## 3 3.649226e-03 0.0061833439 -0.0269071426 0.023478244 -0.0002811612
## 4 2.981332e-03 0.0141465915 0.0060626054 -0.017997391 0.0196690499
## 5 3.287966e-03 -0.0033201920 0.0004104974 -0.006914804 -0.0018425302
## 6 -1.894172e-02 -0.0132958103 0.0102201596 -0.026508146 -0.0066956467
## 7 -5.129332e-04 0.0148340341 -0.0246611495 0.029082143 -0.0086514211
## 8 -3.898721e-03 -0.0009469725 -0.0016352811 -0.001373161 -0.0040737581
## 9 6.113522e-05 -0.0071506911 0.0153339779 -0.025660998 0.0252363176
## 10 1.970222e-03 0.0131020377 0.0121681664 -0.004870636 -0.0044985051
## 11 5.329238e-03 0.0049612414 -0.0069427304 0.007120660 -0.0015881873
## 12 1.744893e-02 -0.0040130015 0.0020424721 0.021831175 0.0113678949
## 13 1.262867e-02 -0.0015129147 0.0007094371 0.003424982 0.0162941417
## PC35 PC36 PC37 PC38 PC39
## 1 0.002543571 -0.0117022055 0.0194024332 -0.0020501829 0.0073637191
## 2 0.002958825 -0.0077399371 0.0178359947 0.0065947068 -0.0010071189
## 3 -0.008724012 0.0045279034 0.0005077024 -0.0051548740 -0.0028603585
## 4 -0.024182442 -0.0427127628 0.0212038159 0.0110237900 -0.0012328612
## 5 0.020355398 -0.0008293688 -0.0010942205 0.0028768728 0.0048495994
## 6 -0.033355870 -0.0329966426 -0.0221282768 0.0143774401 0.0005603990
## 7 -0.014124847 0.0144513606 -0.0097817597 0.0009869065 0.0075041605
## 8 -0.003831058 -0.0056278318 -0.0012771670 0.0004016718 -0.0007627094
## 9 -0.006863702 0.0426738753 0.0394846895 0.0216536041 -0.0034623673
## 10 0.005676438 0.0002761843 0.0020246969 0.0009825398 -0.0009557223
## 11 -0.012430614 -0.0018737929 -0.0015530616 0.0011662163 0.0007436669
## 12 -0.022542480 -0.0293402778 -0.0205176657 0.0007955571 0.0033599140
## 13 -0.004238412 0.0093505483 -0.0037253126 0.0056077852 0.0072692182
## PC40 PC41 PC42 PC43 PC44
## 1 0.0027995035 -0.0025804007 0.0048109152 0.0019067998 -2.724551e-03
## 2 -0.0016406878 -0.0023061972 0.0066992635 0.0027350809 3.098881e-03
## 3 -0.0001642517 -0.0048118442 -0.0056442701 -0.0061145259 8.431813e-04
## 4 -0.0034592852 0.0095510327 0.0016660918 0.0024528068 2.041669e-03
## 5 0.0008327220 0.0003289098 0.0051658690 0.0005924205 2.859167e-05
## 6 -0.0058885419 0.0044003815 -0.0010219476 -0.0064551824 3.851411e-03
## 7 0.0005482449 -0.0001455944 -0.0065028808 -0.0080517266 -7.295247e-03
## 8 0.0003770352 0.0021081403 0.0010751785 0.0020354963 -6.328756e-04
## 9 -0.0085429206 -0.0009096319 0.0205239598 -0.0029709711 4.134153e-03
## 10 -0.0004060854 -0.0005120010 0.0001754743 0.0046099293 -2.204789e-03
## 11 0.0005109423 -0.0025078151 0.0021326524 -0.0008072663 -1.067690e-03
## 12 -0.0003787278 -0.0017265263 0.0060458352 0.0014307225 7.490251e-03
## 13 0.0018977171 -0.0042130539 0.0039727563 -0.0034226730 6.030265e-03
## PC45 PC46 PC47 PC48 GENENAME
## 1 0.0101801552 -0.0041970778 -4.504517e-03 -0.010356680 BMP10
## 2 -0.0099542719 -0.0045540547 1.036295e-03 0.006684751 KCNJ2
## 3 -0.0007933573 0.0008043687 -3.493117e-03 -0.001027548 PITX2
## 4 -0.0056645798 0.0076371832 -9.791100e-04 0.004787730 NPPB
## 5 -0.0068101493 0.0053138297 -1.765072e-02 0.004397189 SCD5
## 6 -0.0041321545 0.0005382408 5.461900e-04 -0.008428587 NPPA
## 7 -0.0009958840 -0.0032948479 7.587697e-04 -0.003991923 KCNV2
## 8 -0.0015659203 0.0022462584 -3.595410e-03 0.003209435 IRX4
## 9 -0.0125983413 -0.0070993250 3.789036e-04 0.012431994 MYL7
## 10 -0.0006508715 0.0057216033 3.520730e-03 0.010179654 IRX3
## 11 -0.0023433898 -0.0067091561 1.290452e-05 0.004801349 IRX5
## 12 -0.0019563549 -0.0052078546 2.882416e-03 0.007880405 KCNJ5
## 13 -0.0014009894 -0.0114040492 2.110588e-03 0.006649243 KCNK3
pca_batchRemoved <- prcomp(t(logCPM_batchRemoved))
pca_scores <- as.data.frame(pca_batchRemoved$x) # PCA scores
pca_scores$Sample <- rownames(pca_scores) # Add sample names
pca_scores <- merge(pca_scores, meta, by.x = "Sample", by.y = "row.names")
dimensions <- list(c("PC1", "PC3"), c("PC1", "PC4"))
generate_and_save_pca_plots(
pca_df = pca_scores,
loadings_df = specific_loadings_df,
dimensions = dimensions,
colors = chamber_colors,
shapes = condition_shapes, # Include shapes for control/disease
add_ellipse = FALSE, # Optionally toggle to TRUE if ellipses are needed
shape_var = "Group" # Use Group as the shape variable
)
### Extract variance explained by each component
# Extract the standard deviation of each principal component
pca_sdev <- pca_batchRemoved$sdev
# Calculate the variance explained by each component
variance_explained <- (pca_sdev^2) / sum(pca_sdev^2)
# Convert to a data frame for easier viewing
variance_explained_df <- data.frame(
Principal_Component = paste0("PC", 1:length(variance_explained)),
Variance_Explained = variance_explained,
Cumulative_Variance = cumsum(variance_explained)
)
# Print variance explained by each component
print(variance_explained_df)
## Principal_Component Variance_Explained Cumulative_Variance
## 1 PC1 3.480855e-01 0.3480855
## 2 PC2 2.934088e-01 0.6414942
## 3 PC3 4.140402e-02 0.6828982
## 4 PC4 3.494612e-02 0.7178444
## 5 PC5 2.789915e-02 0.7457435
## 6 PC6 2.438526e-02 0.7701288
## 7 PC7 2.076548e-02 0.7908943
## 8 PC8 1.893908e-02 0.8098333
## 9 PC9 1.549361e-02 0.8253269
## 10 PC10 1.438969e-02 0.8397166
## 11 PC11 1.392454e-02 0.8536412
## 12 PC12 1.033311e-02 0.8639743
## 13 PC13 9.470018e-03 0.8734443
## 14 PC14 8.604413e-03 0.8820487
## 15 PC15 7.956783e-03 0.8900055
## 16 PC16 7.552653e-03 0.8975581
## 17 PC17 7.105511e-03 0.9046637
## 18 PC18 6.700273e-03 0.9113639
## 19 PC19 6.488442e-03 0.9178524
## 20 PC20 5.906215e-03 0.9237586
## 21 PC21 5.822941e-03 0.9295815
## 22 PC22 5.438269e-03 0.9350198
## 23 PC23 5.286348e-03 0.9403061
## 24 PC24 5.042680e-03 0.9453488
## 25 PC25 4.986711e-03 0.9503355
## 26 PC26 4.737013e-03 0.9550725
## 27 PC27 4.689348e-03 0.9597619
## 28 PC28 4.598649e-03 0.9643605
## 29 PC29 4.423403e-03 0.9687839
## 30 PC30 4.283260e-03 0.9730672
## 31 PC31 4.153394e-03 0.9772206
## 32 PC32 4.056323e-03 0.9812769
## 33 PC33 3.922411e-03 0.9851993
## 34 PC34 3.861792e-03 0.9890611
## 35 PC35 3.757164e-03 0.9928183
## 36 PC36 3.652218e-03 0.9964705
## 37 PC37 3.529492e-03 1.0000000
## 38 PC38 1.991836e-30 1.0000000
## 39 PC39 7.776743e-31 1.0000000
## 40 PC40 5.789392e-31 1.0000000
## 41 PC41 4.999409e-31 1.0000000
## 42 PC42 2.148111e-31 1.0000000
## 43 PC43 1.925285e-31 1.0000000
## 44 PC44 1.196867e-31 1.0000000
## 45 PC45 1.145573e-31 1.0000000
## 46 PC46 7.838796e-32 1.0000000
## 47 PC47 5.926737e-32 1.0000000
## 48 PC48 5.909015e-32 1.0000000
# Subset the metadata to include only "Control" horses
healthy_meta <- meta[meta$Group == "Control", ]
# Subset the count matrix to include only columns (samples) of healthy horses
healthy_samples <- rownames(healthy_meta) # Get sample names of healthy horses
logCPM_healthy <- logCPM[, healthy_samples] # Subset the logCPM matrix
# Use removeBatchEffect with healthy data only
logCPM_healthy_batchRemoved <- removeBatchEffect(logCPM_healthy,
batch = y$samples[healthy_samples, ]$Horse,
design = design[healthy_samples, ])
## Coefficients not estimable: LA_AF LV_AF RA_AF RV_AF
## Warning: Partial NA coefficients for 14510 probe(s)
#Perform PCA on the healthy horses' logCPM values
pca_healthy <- prcomp(t(logCPM_healthy_batchRemoved))
# Extract PCA scores for healthy horses
pca_scores_healthy <- as.data.frame(pca_healthy$x) # PCA scores for healthy horses
pca_scores_healthy$Sample <- rownames(pca_scores_healthy) # Add sample names
# Merge with the healthy subset of metadata for plotting
pca_scores_healthy <- merge(pca_scores_healthy, healthy_meta, by.x = "Sample", by.y = "row.names")
# Define dimensions for PCA plotting
dims <- list(p1 = c("PC1", "PC2"), p2 = c("PC1", "PC3"), p3 = c("PC2", "PC3"))
pca_plot_healthy <- list()
# Create PCA plots for healthy horses using predefined colors and shapes
for (i in seq_along(dims)){
pca_plot_healthy[[i]] <- ggplot(pca_scores_healthy, aes_string(x = dims[[i]][1], y = dims[[i]][2],
color = "Region", fill = "Region")) +
geom_point(size = 4, alpha = 0.8, stroke = 1.5) + # Adjusted to include border color
scale_color_manual(values = chamber_colors) + # Outer colors
scale_fill_manual(values = chamber_fill_colors) + # Fill colors
scale_shape_manual(values = condition_shapes) +
theme_minimal() +
labs(title = paste("PCA (Healthy Horses):", dims[[i]][1], "vs", dims[[i]][2]),
x = dims[[i]][1], y = dims[[i]][2]) +
theme(legend.position = "right")
}
patchwork::wrap_plots(pca_plot_healthy, ncol = 2) + patchwork::plot_layout(guides = 'collect')
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's shape values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's shape values.
## No shared levels found between `names(values)` of the manual scale and the
## data's shape values.
# With labels
for (i in seq_along(dims)){
pca_plot_healthy[[i]] <- ggplot(pca_scores_healthy, aes_string(x = dims[[i]][1], y = dims[[i]][2], color = "Region", shape = "Group", label = "Horse")) +
geom_point(size = 4, alpha = 0.8) +
geom_text(vjust = -0.5, size = 3) + # Add labels for each sample
scale_color_manual(values = chamber_colors) +
scale_shape_manual(values = condition_shapes) +
theme_minimal() +
labs(title = paste("PCA:", dims[[i]][1], "vs", dims[[i]][2]), x = dims[[i]][1], y = dims[[i]][2]) +
theme(legend.position = "right")
}
# Combine PCA plots for healthy horses using patchwork
patchwork::wrap_plots(pca_plot_healthy, ncol = 2) + patchwork::plot_layout(guides = 'collect')
# PCA loadings highlight the genes contributing most to principal components,
# helping identify key drivers of transcriptional variability; supplementing DGE results
# Source the helper functions
source("pca_helpers.R")
#Extract loadings for plot
# Unified function to extract specific loadings with gene names
extract_and_map_loadings <- function(pca_obj, gene_list, annot) {
# Map gene symbols to ENSEMBL IDs and remove NAs
ensembl_ids <- annot$ENSEMBL[match(gene_list, annot$GENENAME)]
ensembl_ids <- ensembl_ids[!is.na(ensembl_ids)]
# Extract loadings for the mapped ENSEMBL IDs present in PCA loadings
valid_ids <- intersect(ensembl_ids, rownames(pca_obj$rotation))
if (length(valid_ids) == 0) {
warning("No matching ENSEMBL IDs found in PCA loadings.")
return(NULL)
}
# Extract loadings and add gene names from annotation
loadings_df <- as.data.frame(pca_obj$rotation[valid_ids, , drop = FALSE])
loadings_df$ENSEMBL <- rownames(loadings_df)
loadings_df <- merge(loadings_df, annot[, c("ENSEMBL", "GENENAME")], by = "ENSEMBL", all.x = TRUE)
return(loadings_df)
}
# Example gene list for loadings extraction (modify as needed)
highlighted_genes <- c("BMP10", "NPPA", "NPPB", "KCNJ5", "KCNK3", "MYL7", "KCNV2", "PCDH7", "KIF18B")
# Extract loadings with unified function
specific_loadings_df <- extract_and_map_loadings(pca_healthy, highlighted_genes, annot_reordered)
# Check results
print(specific_loadings_df)
## ENSEMBL PC1 PC2 PC3 PC4
## 1 ENSECAG00000000599 0.04128203 -0.07121482 0.030304906 -0.032071230
## 2 ENSECAG00000007670 -0.04187792 -0.05344778 -0.182171372 0.006027121
## 3 ENSECAG00000014282 -0.04908616 -0.10659291 0.118463723 -0.002990045
## 4 ENSECAG00000014892 -0.10947283 -0.02092027 0.062474966 0.003249863
## 5 ENSECAG00000016602 -0.07236999 0.03013959 0.085731061 -0.050952951
## 6 ENSECAG00000018912 -0.07944678 -0.01610544 0.011473745 -0.012164607
## 7 ENSECAG00000023019 -0.03526124 0.08195650 -0.030860436 -0.030426407
## 8 ENSECAG00000025007 -0.10372285 0.02219651 -0.008121261 0.038637045
## 9 ENSECAG00000038775 -0.08268097 0.04205286 0.000986836 -0.013644879
## PC5 PC6 PC7 PC8 PC9
## 1 0.005743619 -0.009509976 0.0004020223 -0.001940592 0.006663117
## 2 -0.126553308 0.041239220 -0.0610635273 0.067588478 0.009910526
## 3 -0.033062846 -0.035715429 -0.1216359789 0.055189123 0.049599924
## 4 -0.020083032 -0.037107553 -0.0955982451 -0.004075129 0.014493231
## 5 -0.056147648 -0.020596374 -0.0028262312 0.027970752 0.064543567
## 6 -0.013894399 0.019483412 -0.0068119499 0.021592442 0.014527742
## 7 -0.026246107 0.007321278 0.0335366847 -0.027098257 -0.023158184
## 8 -0.007536985 0.026516548 -0.0133554256 0.023942239 -0.036670701
## 9 -0.008717469 0.012687793 0.0082786036 0.006067485 -0.012061662
## PC10 PC11 PC12 PC13 PC14
## 1 -0.014348359 -0.029720120 -0.0029925903 6.080895e-03 -0.0008997756
## 2 -0.029649721 -0.017658428 -0.0786358849 4.052689e-02 0.0196434957
## 3 -0.009567100 -0.069878973 -0.0230841113 -4.260866e-05 -0.0087791247
## 4 0.002631524 0.001711544 -0.0096136133 -1.676756e-02 -0.0038826057
## 5 0.036220864 -0.005228765 -0.0258024831 3.651355e-02 0.0061294993
## 6 0.043432730 -0.007032016 0.0054679749 -1.133375e-02 0.0234860617
## 7 -0.007601022 0.008123758 -0.0050531598 -6.023305e-04 0.0049842649
## 8 0.006685247 0.023159496 -0.0046750647 2.163485e-02 0.0329738243
## 9 -0.001199439 0.015060327 -0.0001605864 1.585224e-02 0.0127654678
## PC15 PC16 PC17 PC18 PC19
## 1 0.0012916731 -0.018634127 0.0002603693 0.001980064 -0.002660186
## 2 0.0435021885 0.022570455 -0.0185994510 -0.000264414 0.004494277
## 3 0.0017600062 -0.008754142 0.0338874444 -0.034024806 -0.017966932
## 4 0.0016642738 -0.026328777 0.0120437145 -0.040533169 -0.011141913
## 5 0.0269196032 -0.006978850 -0.0021779620 -0.002438107 -0.006965289
## 6 -0.0161986990 -0.009510093 0.0224553375 0.039771249 -0.006066325
## 7 -0.0049242688 -0.002195093 0.0259747386 -0.024210392 0.003487952
## 8 0.0008313835 -0.017779019 0.0025486425 -0.026666551 -0.004019149
## 9 0.0020570195 -0.004555134 0.0074710045 0.009732350 -0.005855678
## PC20 PC21 PC22 PC23 PC24 GENENAME
## 1 0.0012394673 -0.0092387190 -0.0070551437 -0.000630666 0.002514286 PCDH7
## 2 0.0100987242 -0.0155825346 -0.0028594309 -0.020072186 -0.001655615 BMP10
## 3 -0.0199487981 -0.0167329406 -0.0060256662 0.005186767 -0.011830997 NPPB
## 4 0.0020671822 -0.0003647640 -0.0115427127 0.003982254 0.002701151 NPPA
## 5 0.0005184457 -0.0074043455 -0.0049424451 -0.005484195 -0.002987105 KCNV2
## 6 -0.0074759431 0.0008165151 -0.0162637746 0.017510515 -0.017375019 MYL7
## 7 -0.0078558964 -0.0058403386 0.0005389701 -0.006758917 -0.005072034 KIF18B
## 8 0.0001209625 -0.0034348392 -0.0029844971 -0.011504834 0.005912755 KCNJ5
## 9 0.0021067526 -0.0019574276 -0.0020470672 -0.001530989 0.002629899 KCNK3
# Example dimensions for PCA plotting
dimensions <- list(c("PC1", "PC2"), c("PC1", "PC3"), c("PC2", "PC3"))
# Generate and display the PCA plots with probability ellipses
generate_and_save_pca_plots(
pca_df = pca_scores_healthy,
loadings_df = specific_loadings_df,
dimensions = dimensions,
colors = chamber_colors,
add_ellipse = FALSE # Toggle to TRUE to include ellipses
)
### Extract variance explained by each component
# Extract the standard deviation of each principal component
pca_sdev <- pca_healthy$sdev
# Calculate the variance explained by each component
variance_explained <- (pca_sdev^2) / sum(pca_sdev^2)
# Convert to a data frame for easier viewing
variance_explained_df <- data.frame(
Principal_Component = paste0("PC", 1:length(variance_explained)),
Variance_Explained = variance_explained,
Cumulative_Variance = cumsum(variance_explained)
)
# Print variance explained by each component
print(variance_explained_df)
## Principal_Component Variance_Explained Cumulative_Variance
## 1 PC1 5.697720e-01 0.5697720
## 2 PC2 7.518616e-02 0.6449582
## 3 PC3 6.488271e-02 0.7098409
## 4 PC4 3.657129e-02 0.7464122
## 5 PC5 3.515831e-02 0.7815705
## 6 PC6 2.627400e-02 0.8078445
## 7 PC7 2.211154e-02 0.8299560
## 8 PC8 2.115299e-02 0.8511090
## 9 PC9 1.938149e-02 0.8704905
## 10 PC10 1.756410e-02 0.8880546
## 11 PC11 1.653983e-02 0.9045944
## 12 PC12 1.603807e-02 0.9206325
## 13 PC13 1.497715e-02 0.9356097
## 14 PC14 1.429256e-02 0.9499022
## 15 PC15 1.346297e-02 0.9633652
## 16 PC16 1.249215e-02 0.9758573
## 17 PC17 1.222929e-02 0.9880866
## 18 PC18 1.191337e-02 1.0000000
## 19 PC19 1.227215e-30 1.0000000
## 20 PC20 5.261321e-31 1.0000000
## 21 PC21 3.357265e-31 1.0000000
## 22 PC22 3.294082e-31 1.0000000
## 23 PC23 2.519813e-31 1.0000000
## 24 PC24 1.340784e-31 1.0000000
# create contrasts to test
colnames(design)
## [1] "LA_AF" "LA_Control" "LV_AF" "LV_Control" "RA_AF"
## [6] "RA_Control" "RV_AF" "RV_Control"
con <- makeContrasts(AF_vs_control_RA = RA_AF - RA_Control,
AF_vs_control_LA = LA_AF - LA_Control,
RA_vs_LA_control = RA_Control - LA_Control,
RA_vs_LA_AF = RA_AF - LA_AF,
InteractionEffect_Atrial = (RA_AF - RA_Control) - (LA_AF - LA_Control),
Avr.Dis.Effect_Atrial = (LA_AF + RA_AF)/2 - (LA_Control + RA_Control)/2,
AverageRegionEffect_Atrial = (RA_Control + RA_AF)/2 - (LA_Control + LA_AF)/2,
AF_vs_control_RV = RV_AF - RV_Control,
AF_vs_control_LV = LV_AF - LV_Control,
RV_vs_LV_control = RV_Control - LV_Control,
RV_vs_LV_AF = RV_AF - LV_AF,
InteractionEffect_Ventricular = (RV_AF - RV_Control) - (LV_AF - LV_Control),
Avr.Dis.Effect_Ventricular = (LV_AF + RV_AF)/2 - (LV_Control + RV_Control)/2,
AverageRegionEffect_Ventricular = (RV_Control + RV_AF)/2 - (LV_Control + LV_AF)/2,
AverageRegionEffect_AV_healthy = (RA_Control + LA_Control)/2 - (RV_Control + LV_Control)/2,
levels = design)
con
## Contrasts
## Levels AF_vs_control_RA AF_vs_control_LA RA_vs_LA_control RA_vs_LA_AF
## LA_AF 0 1 0 -1
## LA_Control 0 -1 -1 0
## LV_AF 0 0 0 0
## LV_Control 0 0 0 0
## RA_AF 1 0 0 1
## RA_Control -1 0 1 0
## RV_AF 0 0 0 0
## RV_Control 0 0 0 0
## Contrasts
## Levels InteractionEffect_Atrial Avr.Dis.Effect_Atrial
## LA_AF -1 0.5
## LA_Control 1 -0.5
## LV_AF 0 0.0
## LV_Control 0 0.0
## RA_AF 1 0.5
## RA_Control -1 -0.5
## RV_AF 0 0.0
## RV_Control 0 0.0
## Contrasts
## Levels AverageRegionEffect_Atrial AF_vs_control_RV AF_vs_control_LV
## LA_AF -0.5 0 0
## LA_Control -0.5 0 0
## LV_AF 0.0 0 1
## LV_Control 0.0 0 -1
## RA_AF 0.5 0 0
## RA_Control 0.5 0 0
## RV_AF 0.0 1 0
## RV_Control 0.0 -1 0
## Contrasts
## Levels RV_vs_LV_control RV_vs_LV_AF InteractionEffect_Ventricular
## LA_AF 0 0 0
## LA_Control 0 0 0
## LV_AF 0 -1 -1
## LV_Control -1 0 1
## RA_AF 0 0 0
## RA_Control 0 0 0
## RV_AF 0 1 1
## RV_Control 1 0 -1
## Contrasts
## Levels Avr.Dis.Effect_Ventricular AverageRegionEffect_Ventricular
## LA_AF 0.0 0.0
## LA_Control 0.0 0.0
## LV_AF 0.5 -0.5
## LV_Control -0.5 -0.5
## RA_AF 0.0 0.0
## RA_Control 0.0 0.0
## RV_AF 0.5 0.5
## RV_Control -0.5 0.5
## Contrasts
## Levels AverageRegionEffect_AV_healthy
## LA_AF 0.0
## LA_Control 0.5
## LV_AF 0.0
## LV_Control -0.5
## RA_AF 0.0
## RA_Control 0.5
## RV_AF 0.0
## RV_Control -0.5
voomLmFit# V is the result of my linear model fitted using voom transformation
y_raw <- d[keep, ,keep.lib.sizes=FALSE]
# We created the y_raw, but we will use our TMM-normalised data instead = run y instead
v <- voomLmFit(counts = y,
design = design,
block = as.factor(y$samples$Horse),
sample.weights = TRUE,
plot = TRUE)
## First sample weights (min/max) 0.2817808/1.6829927
## First intra-block correlation 0.2269787
## Final sample weights (min/max) 0.2815015/1.6953994
## Final intra-block correlation 0.2269201
res <- list() # list for DGE results
for (i in colnames(con)) {
fit <- contrasts.fit(v, contrasts = con)
fit <- eBayes(fit, robust = TRUE)
res[[i]] <- topTable(fit, coef = i, number = Inf)
res[[i]] <- data.frame(res[[i]], Contrast = i)
n <- res[[i]] %>% dplyr::filter(adj.P.Val < 0.05) %>% nrow
print(paste('number of DE genes for',i, '=', n))
}
## [1] "number of DE genes for AF_vs_control_RA = 1330"
## [1] "number of DE genes for AF_vs_control_LA = 714"
## [1] "number of DE genes for RA_vs_LA_control = 520"
## [1] "number of DE genes for RA_vs_LA_AF = 169"
## [1] "number of DE genes for InteractionEffect_Atrial = 0"
## [1] "number of DE genes for Avr.Dis.Effect_Atrial = 2076"
## [1] "number of DE genes for AverageRegionEffect_Atrial = 847"
## [1] "number of DE genes for AF_vs_control_RV = 268"
## [1] "number of DE genes for AF_vs_control_LV = 60"
## [1] "number of DE genes for RV_vs_LV_control = 1299"
## [1] "number of DE genes for RV_vs_LV_AF = 0"
## [1] "number of DE genes for InteractionEffect_Ventricular = 109"
## [1] "number of DE genes for Avr.Dis.Effect_Ventricular = 207"
## [1] "number of DE genes for AverageRegionEffect_Ventricular = 591"
## [1] "number of DE genes for AverageRegionEffect_AV_healthy = 7889"
res_all <- do.call(rbind, res)
# Create output Excel file
res_cleaned <- lapply(res, function(df) {
names(df)[names(df) == "GeneName"] <- "Ensemblname" # or remove it if redundant
df
})
openxlsx::write.xlsx(x = res_cleaned, file = "../output/dge_results.xlsx", asTable = TRUE)
# Create output TSV file
data.table::fwrite(x = res_all, file = "../output/dge_results.tsv.gz", sep = "\t")
ggplot(res_all, aes(x = P.Value)) +
geom_histogram(fill = "lightgray",
color = "black",
breaks = seq(0, 1, by = 0.05),
closed = "right",
lwd = 0.2) +
facet_wrap(~ Contrast, nrow = 3, scales = "free") +
theme_bw()
volcano_plots <- list()
for (i in names(res)){
volcano_plots[[i]] <- ggVolcano(x = res[[i]],
fdr = 0.05,
fdr.column = "adj.P.Val",
pvalue.column = "P.Value",
logFC = 0,
logFC.column = "logFC",
text.size = 2) +
theme_bw(base_size = 10) +
ggtitle(i)
}
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
## Warning in max(x[get(fdr.column) <= .fdr][, get(pvalue.column)]): no
## non-missing arguments to max; returning -Inf
# Combine all volcano plots into a single layout with 3 columns.
patchwork::wrap_plots(volcano_plots, ncol = 3)
# Print individual volcano plots for key contrasts.
print(volcano_plots[["RA_vs_LA_control"]])
print(volcano_plots[["RV_vs_LV_control"]])
print(volcano_plots[["AverageRegionEffect_AV_healthy"]])
# Create a combined plot of the key contrasts for easier comparison.
combined_plot <- ((volcano_plots[["RA_vs_LA_control"]]) | (volcano_plots[["AverageRegionEffect_AV_healthy"]]))
print(combined_plot)
### Volcano Plot Custom
regions <- unique(healthy_meta$Region)
# Gene Name Mapping for Volcano Plots in RNA-seq Analysis
# Check if the annotation dataframe has the necessary columns
if (!"ENSEMBL" %in% colnames(annot_reordered) || !"GENENAME" %in% colnames(annot_reordered)) {
stop("Error: The annotation dataframe must contain both 'ENSEMBL' and 'GENENAME' columns.")
}
# Create a named vector for ENSEMBL to GENENAME mapping
ensembl_to_genename <- setNames(annot_reordered$GENENAME, annot_reordered$ENSEMBL)
# Map ENSEMBL IDs to GeneNames in the res list
for (contrast_name in names(res)) {
# Ensure the dataframe has ENSEMBL IDs as rownames
if (!"ENSEMBL" %in% colnames(res[[contrast_name]])) {
res[[contrast_name]]$ENSEMBL <- rownames(res[[contrast_name]])
}
# Map GENENAMEs to the dataframe using ENSEMBL IDs
res[[contrast_name]]$GENENAME <- ensembl_to_genename[res[[contrast_name]]$ENSEMBL]
# Replace NA values in GENENAME with ENSEMBL IDs (to ensure plotting works even if some gene names are missing)
res[[contrast_name]]$GENENAME[is.na(res[[contrast_name]]$GENENAME)] <- res[[contrast_name]]$ENSEMBL[is.na(res[[contrast_name]]$GENENAME)]
}
# Source the helper function
source("volcano_helpers.R")
# Create lists to store both versions of volcano plots
volcano_plots_no_labels <- list()
volcano_plots_with_labels <- list()
# Iterate over each contrast in `res` and create custom volcano plots with/without labels
for (contrast_name in names(res)) {
# Ensure the GeneName column is present in the dataframe for labeling
if (!"GeneName" %in% colnames(res[[contrast_name]])) {
# Map ENSEMBL to GeneName using the preprocessed mapping vector
res[[contrast_name]]$GeneName <- sapply(rownames(res[[contrast_name]]), function(x) gsub(".*_", "", x))
}
# Generate volcano plots with and without labels using the helper function
volcano_plots <- create_custom_volcano_plot(
df = res[[contrast_name]],
logFC_col = "logFC",
pvalue_col = "P.Value",
adj_pvalue_col = "adj.P.Val",
contrast_name = contrast_name,
fc_cutoff = 1,
pvalue_cutoff = 0.05,
save_plot = TRUE,
output_path = "../output/",
show_labels = TRUE # Always generate both labeled and unlabeled versions
)
# Store the plots in separate lists
volcano_plots_no_labels[[contrast_name]] <- volcano_plots$No_Labels
volcano_plots_with_labels[[contrast_name]] <- volcano_plots$With_Labels
}
## Warning: ggrepel: 1149 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 624 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 407 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 120 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1806 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 699 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 190 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1082 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 156 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 495 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 6684 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Combine and display the volcano plots without labels
patchwork::wrap_plots(volcano_plots_no_labels, ncol = 3)
# Print individual volcano plots with labels for key contrasts
print(volcano_plots_with_labels[["RA_vs_LA_control"]])
## Warning: ggrepel: 380 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(volcano_plots_no_labels[["RA_vs_LA_control"]])
print(volcano_plots_with_labels[["RV_vs_LV_control"]])
## Warning: ggrepel: 1021 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(volcano_plots_with_labels[["AverageRegionEffect_AV_healthy"]])
## Warning: ggrepel: 6607 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(volcano_plots_with_labels[["AF_vs_control_LA"]])
## Warning: ggrepel: 551 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(volcano_plots_with_labels[["AF_vs_control_RA"]])
## Warning: ggrepel: 1091 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
### Volcano Plot Custom (Revision)
regions <- unique(healthy_meta$Region)
# Gene Name Mapping for Volcano Plots in RNA-seq Analysis
# Check if the annotation dataframe has the necessary columns
if (!"ENSEMBL" %in% colnames(annot_reordered) || !"GENENAME" %in% colnames(annot_reordered)) {
stop("Error: The annotation dataframe must contain both 'ENSEMBL' and 'GENENAME' columns.")
}
# Create a named vector for ENSEMBL to GENENAME mapping
ensembl_to_genename <- setNames(annot_reordered$GENENAME, annot_reordered$ENSEMBL)
# Map ENSEMBL IDs to GeneNames in the res list
for (contrast_name in names(res)) {
# Ensure the dataframe has ENSEMBL IDs as rownames
if (!"ENSEMBL" %in% colnames(res[[contrast_name]])) {
res[[contrast_name]]$ENSEMBL <- rownames(res[[contrast_name]])
}
# Map GENENAMEs to the dataframe using ENSEMBL IDs
res[[contrast_name]]$GENENAME <- ensembl_to_genename[res[[contrast_name]]$ENSEMBL]
# Replace NA values in GENENAME with ENSEMBL IDs (to ensure plotting works even if some gene names are missing)
res[[contrast_name]]$GENENAME[is.na(res[[contrast_name]]$GENENAME)] <- res[[contrast_name]]$ENSEMBL[is.na(res[[contrast_name]]$GENENAME)]
}
# Source the helper function
source("volcano_helpers.R")
# Create lists to store both versions of volcano plots
volcano_plots_no_labels <- list()
volcano_plots_with_labels <- list()
# Iterate over each contrast in `res` and create custom volcano plots with/without labels
for (contrast_name in names(res)) {
# Ensure the GeneName column is present in the dataframe for labeling
if (!"GeneName" %in% colnames(res[[contrast_name]])) {
# Map ENSEMBL to GeneName using the preprocessed mapping vector
res[[contrast_name]]$GeneName <- sapply(rownames(res[[contrast_name]]), function(x) gsub(".*_", "", x))
}
# Generate volcano plots with and without labels using the helper function
volcano_plots <- create_custom_volcano_plot(
df = res[[contrast_name]],
logFC_col = "logFC",
pvalue_col = "P.Value",
adj_pvalue_col = "adj.P.Val",
contrast_name = contrast_name,
fc_cutoff = 0,
pvalue_cutoff = 0.05,
save_plot = TRUE,
output_path = "../output/",
show_labels = TRUE # Always generate both labeled and unlabeled versions
)
# Store the plots in separate lists
volcano_plots_no_labels[[contrast_name]] <- volcano_plots$No_Labels
volcano_plots_with_labels[[contrast_name]] <- volcano_plots$With_Labels
}
## Warning: ggrepel: 1149 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 624 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 407 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 120 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1806 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 699 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 190 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1082 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 156 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 495 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 6684 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Combine and display the volcano plots without labels
patchwork::wrap_plots(volcano_plots_no_labels, ncol = 3)
# Print individual volcano plots with labels for key contrasts
print(volcano_plots_with_labels[["RA_vs_LA_control"]])
## Warning: ggrepel: 380 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(volcano_plots_no_labels[["RA_vs_LA_control"]])
print(volcano_plots_with_labels[["RV_vs_LV_control"]])
## Warning: ggrepel: 1021 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(volcano_plots_with_labels[["AverageRegionEffect_AV_healthy"]])
## Warning: ggrepel: 6607 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(volcano_plots_with_labels[["AF_vs_control_LA"]])
## Warning: ggrepel: 551 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(volcano_plots_with_labels[["AF_vs_control_RA"]])
## Warning: ggrepel: 1091 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
This script identifies genes detected specifically or exclusively in different heart regions (left/right atria and ventricles) using the Bgee pipeline prescense/absence call.
# This analysis identifies genes detected specifically or exclusively in different heart regions
# (left/right atria and ventricles) using the Bgee pipeline presence/absence calls.
# Step 1: Read data based on bgee-call workflow performed on uCloud directly on raw-libraries
bgee <- fread("../../../data/count/merged_calls_bgee.tsv")
## Warning in fread("../../../data/count/merged_calls_bgee.tsv"): Detected 48
## column names but the data has 49 columns (i.e. invalid file). Added 1 extra
## default column name for the first column which is guessed to be row names or an
## index. Use setnames() afterwards if this guess is not correct, or fix the file
## write command that created the file to create a valid file.
bgee <- bgee %>%
rename(GeneID = V1)
# Step 2: Filter metadata for healthy samples
meta_reordered <- meta_reordered %>%
filter(Group == "Control") # Only healthy samples
meta_reordered <- meta_reordered %>%
mutate(Sample = rownames(meta_reordered)) # Add a 'Sample' column for mapping
# Step 3: Subset bgee to include only healthy samples
healthy_samples <- rownames(meta_reordered) # Row names in meta_reordered are the sample names
healthy_columns <- c("GeneID", healthy_samples) # Keep GeneID and healthy sample columns
bgee_healthy <- bgee %>%
select(all_of(healthy_columns)) # Subset bgee using healthy sample columns
# Step 4: Reshape bgee_healthy into long format and map regions
bgee_long <- bgee_healthy %>%
pivot_longer(-GeneID, names_to = "Sample", values_to = "Presence") %>%
left_join(meta_reordered %>% select(Sample, Region), by = "Sample") %>% # Map regions
mutate(Presence = as.numeric(Presence == "present")) # Convert "present"/"absent" to binary
# Step 5: Calculate the percentage of "present" samples per region
bgee_summary <- bgee_long %>%
group_by(GeneID, Region) %>%
summarize(PercentPresent = mean(Presence), .groups = "drop") # Proportion of "present" samples per region
# Step 6: Apply the 75% threshold for detection in a region
detection_threshold <- 0.75
binary_matrix <- bgee_summary %>%
mutate(IsPresent = PercentPresent >= detection_threshold) %>% # Binary presence/absence
pivot_wider(names_from = Region, values_from = IsPresent, values_fill = FALSE) %>% # Create binary matrix
filter(rowSums(select(., -GeneID)) > 0) %>% # Retain rows where at least one region is TRUE
distinct(GeneID, .keep_all = TRUE) %>% # Remove duplicates
mutate(across(-GeneID, as.numeric)) # Convert logical values to numeric
binary_matrix <- binary_matrix %>%
select(-PercentPresent) # Remove the PercentPresent column
# Step 7: Annotate binary_matrix
annot_reordered <- annot_reordered %>%
distinct(ENSEMBL, .keep_all = TRUE) # Keep only unique rows by ENSEMBL
binary_matrix_annot <- binary_matrix %>%
left_join(annot_reordered[, c("ENSEMBL", "GENENAME")], by = c("GeneID" = "ENSEMBL")) %>%
mutate(GENENAME = if_else(is.na(GENENAME), GeneID, GENENAME)) %>%
distinct(GeneID, .keep_all = TRUE) # Ensure no duplicates after annotation
# Step 8: Update binary matrix for strict inclusion and exclusion
binary_matrix_strict <- binary_matrix_annot %>%
mutate(
Atrial_Specific = if_all(all_of(c("LA", "RA")), ~ . == 1) & if_all(all_of(c("LV", "RV")), ~ . == 0),
Ventricular_Specific = if_all(all_of(c("LV", "RV")), ~ . == 1) & if_all(all_of(c("LA", "RA")), ~ . == 0),
LA_Only = (LA == 1) & if_all(all_of(c("RA", "LV", "RV")), ~ . == 0),
RA_Only = (RA == 1) & if_all(all_of(c("LA", "LV", "RV")), ~ . == 0),
LV_Only = (LV == 1) & if_all(all_of(c("LA", "RA", "RV")), ~ . == 0),
RV_Only = (RV == 1) & if_all(all_of(c("LA", "RA", "LV")), ~ . == 0),
Left_Side_Specific = if_all(all_of(c("LA", "LV")), ~ . == 1) & if_all(all_of(c("RA", "RV")), ~ . == 0),
Right_Side_Specific = if_all(all_of(c("RA", "RV")), ~ . == 1) & if_all(all_of(c("LA", "LV")), ~ . == 0),
All_Four_Regions = if_all(all_of(c("LA", "RA", "LV", "RV")), ~ . == 1)
) %>%
select(GeneID, Atrial_Specific, Ventricular_Specific, LA_Only, RA_Only, LV_Only, RV_Only, Left_Side_Specific, Right_Side_Specific, All_Four_Regions) %>%
filter(rowSums(select(., -GeneID)) > 0) # Retain genes that meet criteria in at least one category
# Generate the UpSet plot
upset_plot <- ComplexUpset::upset(
binary_matrix_strict,
colnames(binary_matrix_strict)[-1], # Use the updated region-specific categories
name = "Region-Specific Genes",
intersections = "all", # Show all intersections
min_size = 5, # Minimum number of genes per intersection
width_ratio = 0.2,
height_ratio = 0.4,
themes = upset_default_themes(panel.grid.major = element_blank())
)
# Customize and display the plot
upset_plot <- upset_plot +
ggtitle("Region-Specific Genes in Healthy Horse Hearts (Strict Logic)") +
theme(plot.title = element_text(hjust = 0.5, size = 16, face = "bold"))
print(upset_plot)
# Save the plot
ggsave(
filename = "../output/Detected_Genes_Intersections_UpSet_BGEE.png",
plot = upset_plot,
width = 6, height = 4, dpi = 600
)
# Step 9: Extract genes for each intersection
gene_lists <- list()
# Define updated intersection logic
intersections <- list(
"LA_Only" = list(include = c("LA"), exclude = c("RA", "LV", "RV")),
"RA_Only" = list(include = c("RA"), exclude = c("LA", "LV", "RV")),
"LV_Only" = list(include = c("LV"), exclude = c("LA", "RA", "RV")),
"RV_Only" = list(include = c("RV"), exclude = c("LA", "RA", "LV")),
"Atrial_Specific" = list(include = c("LA", "RA"), exclude = c("LV", "RV")),
"Ventricular_Specific" = list(include = c("LV", "RV"), exclude = c("LA", "RA")),
"Left_Side_Specific" = list(include = c("LA", "LV"), exclude = c("RA", "RV")),
"Right_Side_Specific" = list(include = c("RA", "RV"), exclude = c("LA", "LV")),
"All_Four_Regions" = list(include = c("LA", "RA", "LV", "RV"), exclude = c())
)
# Extract genes for each intersection
gene_lists <- list()
for (name in names(intersections)) {
include_regions <- intersections[[name]]$include
exclude_regions <- intersections[[name]]$exclude
# Enforce strict inclusion and exclusion criteria
genes_in_intersection <- binary_matrix_annot %>%
filter(if_all(all_of(include_regions), ~ . == 1)) %>% # Present in all included regions
filter(if_all(all_of(exclude_regions), ~ . == 0)) %>% # Absent in all excluded regions
select(GeneID, GENENAME)
# Store the results
gene_lists[[name]] <- genes_in_intersection
}
# Step 10: Save gene lists to Excel
output_file <- "../output/Detected_Genes_Intersections_Bgee.xlsx"
wb <- createWorkbook()
for (sheet_name in names(gene_lists)) {
addWorksheet(wb, sheet_name)
writeData(wb, sheet = sheet_name, x = gene_lists[[sheet_name]])
}
saveWorkbook(wb, output_file, overwrite = TRUE)
cat("Intersection lists successfully saved to:", output_file, "\n")
## Intersection lists successfully saved to: ../output/Detected_Genes_Intersections_Bgee.xlsx
sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=Danish_Denmark.utf8 LC_CTYPE=Danish_Denmark.utf8
## [3] LC_MONETARY=Danish_Denmark.utf8 LC_NUMERIC=C
## [5] LC_TIME=Danish_Denmark.utf8
##
## time zone: Europe/Copenhagen
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_1.3.1 ComplexUpset_1.3.3 ggrepel_0.9.6 RColorBrewer_1.1-3
## [5] dplyr_1.1.4 devtools_2.4.5 usethis_3.0.0 openxlsx_4.2.7.1
## [9] patchwork_1.3.0 data.table_1.16.2 ggplot2_3.5.1 stringr_1.5.1
## [13] tibble_3.2.1 magrittr_2.0.3 biomaRt_2.62.0 readxl_1.4.3
## [17] readr_2.1.5 edgeR_4.4.0 limma_3.62.1 aamisc_0.1.5
## [21] pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.17.1 jsonlite_1.8.9 farver_2.1.2
## [4] rmarkdown_2.29 ragg_1.3.3 fs_1.6.5
## [7] zlibbioc_1.52.0 vctrs_0.6.5 multtest_2.62.0
## [10] memoise_2.0.1 htmltools_0.5.8.1 progress_1.2.3
## [13] curl_6.0.1 cellranger_1.1.0 sass_0.4.9
## [16] bslib_0.8.0 htmlwidgets_1.6.4 plyr_1.8.9
## [19] httr2_1.0.6 cachem_1.1.0 mime_0.12
## [22] lifecycle_1.0.4 pkgconfig_2.0.3 Matrix_1.7-1
## [25] R6_2.5.1 fastmap_1.2.0 GenomeInfoDbData_1.2.13
## [28] shiny_1.9.1 digest_0.6.37 HarmonicRegression_1.0
## [31] colorspace_2.1-1 AnnotationDbi_1.68.0 S4Vectors_0.44.0
## [34] pkgload_1.4.0 textshaping_0.4.0 RSQLite_2.3.8
## [37] labeling_0.4.3 filelock_1.0.3 fansi_1.0.6
## [40] httr_1.4.7 compiler_4.4.2 remotes_2.5.0
## [43] bit64_4.5.2 withr_3.0.2 rain_1.40.0
## [46] DBI_1.2.3 pkgbuild_1.4.5 R.utils_2.12.3
## [49] MASS_7.3-61 rappdirs_0.3.3 sessioninfo_1.2.2
## [52] tools_4.4.2 NISTunits_1.0.1 zip_2.3.1
## [55] httpuv_1.6.15 R.oo_1.27.0 glue_1.8.0
## [58] promises_1.3.0 grid_4.4.2 reshape2_1.4.4
## [61] generics_0.1.3 gtable_0.3.6 tzdb_0.4.0
## [64] R.methodsS3_1.8.2 hms_1.1.3 xml2_1.3.6
## [67] utf8_1.2.4 XVector_0.46.0 BiocGenerics_0.52.0
## [70] pillar_1.9.0 vroom_1.6.5 later_1.3.2
## [73] splines_4.4.2 BiocFileCache_2.14.0 lattice_0.22-6
## [76] survival_3.7-0 gmp_0.7-5 bit_4.5.0
## [79] tidyselect_1.2.1 locfit_1.5-9.10 Biostrings_2.74.0
## [82] miniUI_0.1.1.1 knitr_1.49 IRanges_2.40.0
## [85] stats4_4.4.2 xfun_0.49 Biobase_2.66.0
## [88] statmod_1.5.0 stringi_1.8.4 UCSC.utils_1.2.0
## [91] yaml_2.3.10 evaluate_1.0.1 qvalue_2.38.0
## [94] cli_3.6.3 systemfonts_1.1.0 xtable_1.8-4
## [97] munsell_0.5.1 jquerylib_0.1.4 Rcpp_1.0.13-1
## [100] GenomeInfoDb_1.42.0 dbplyr_2.5.0 png_0.1-8
## [103] parallel_4.4.2 ellipsis_0.3.2 blob_1.2.4
## [106] prettyunits_1.2.0 profvis_0.4.0 urlchecker_1.0.1
## [109] scales_1.3.0 purrr_1.0.2 crayon_1.5.3
## [112] rlang_1.1.4 KEGGREST_1.46.0